1 Introduction

In previous notebooks and scripts, we integrated our dataset with the one of Hamish et al., and merge it with the RNA fraction of the multiome dataset. Here, we will correct for batch effects with Harmony, and visualize the intermixing of different potential confounders pre- and post-integration.

In addition, we will save the dimensionality reduction (PCA) matrices before and after integration to further quantify the effect of the aforementioned confounders in high dimensional space.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(SeuratWrappers)
library(harmony)
library(tidyverse)

2.2 Parameters

# Paths
path_to_obj <- here::here("scRNA-seq/results/R_objects/tonsil_rna_merged_with_multiome_missing_integration.rds")
# path_to_obj <- here::here("scRNA-seq/results/R_objects/tonsil_rna_merged_with_multiome_missing_integration_downsampled.rds")
path_to_save_obj <- here::here("scRNA-seq/results/R_objects/tonsil_rna_integrated_with_multiome.rds")
path_tmp_dir <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/3-data_integration_multiome/tmp/")
path_to_save_dimred_uncorrect <- str_c(path_tmp_dir, "batch_uncorrected_pca.rds", sep = "")
path_to_save_dimred_correct <- str_c(path_tmp_dir, "batch_corrected_pca.rds", sep = "")
path_to_save_confounders_df <- str_c(path_tmp_dir, "confounders_df.rds", sep = "") 
path_to_save_supp_df <- here::here("data/raw_data_figures/umaps_rna_batch_correction.csv")

2.3 Load data

tonsil <- readRDS(path_to_obj)
tonsil
## An object of class Seurat 
## 37378 features across 362395 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)

3 Visualize UMAP without batch effect correction

Since we noticed that the 3 matrices (scRNA-seq matrices, King et al., and multiome) have different sets of features (genes), we will find assay-specific variable genes. Then, we will use the intersection to improve the batch-effect correction.

# Find shared highly variable genes (HGV)
tonsil_list <- SplitObject(tonsil, split.by = "assay")
tonsil_list <- purrr::map(tonsil_list, FindVariableFeatures, nfeatures = 5000)
hvg <- purrr::map(tonsil_list, VariableFeatures)
shared_hvg <- Reduce(intersect, hvg)
print(length(shared_hvg))
## [1] 1740
print(class(shared_hvg))
## [1] "character"
print(head(shared_hvg))
## [1] "HBA2"  "HBA1"  "IGHM"  "IGHD"  "FDCSP" "CLU"
print(tail(shared_hvg))
## [1] "BAIAP2L1"  "COL19A1"   "FAM110B"   "LINC00402" "MPDZ"      "LINC02470"
rm(tonsil_list)


# Process Seurat object
tonsil <- tonsil%>% 
  NormalizeData(normalization.method = "LogNormalize", scale.factor = 1e4) %>%
  ScaleData(features = shared_hvg) %>% 
  RunPCA(features = shared_hvg) %>%
  RunUMAP(reduction = "pca", dims = 1:30)


# Visualize UMAP
confounders <- c("library_name", "sex", "age_group", "is_hashed",
                 "hospital", "assay")
umaps_before_integration <- purrr::map(confounders, function(x) {
  p <- DimPlot(tonsil, group.by = x, pt.size = 0.1)
  p
})
names(umaps_before_integration) <- confounders
print("UMAP colored by GEM:")
## [1] "UMAP colored by GEM:"
umaps_before_integration$library_name + NoLegend()

print("UMAP colored by sex, age group, cell hashing status, sampling center and assay:")
## [1] "UMAP colored by sex, age group, cell hashing status, sampling center and assay:"
umaps_before_integration[2:length(umaps_before_integration)]
## $sex

## 
## $age_group

## 
## $is_hashed

## 
## $hospital

## 
## $assay

# Dataframe to generate supplementary figures
unintegrated_df <- data.frame(
  barcode = colnames(tonsil),
  assay = tonsil$assay,
  UMAP_1 = tonsil@reductions$umap@cell.embeddings[, "UMAP_1"],
  UMAP_2 = tonsil@reductions$umap@cell.embeddings[, "UMAP_2"],
  annotation_king = tonsil$cell_type
)
unintegrated_df$is_integrated <- "unintegrated"

4 Run and visualize Harmony’s integration

tonsil <- tonsil %>%
  RunHarmony(reduction = "pca", dims = 1:30, group.by.vars = "gem_id") %>%
  RunUMAP(reduction = "harmony", dims = 1:30)
umaps_after_integration <- purrr::map(confounders, function(x) {
  p <- DimPlot(tonsil, group.by = x, pt.size = 0.1)
  p
})
names(umaps_after_integration) <- confounders
print("UMAP colored by GEM:")
## [1] "UMAP colored by GEM:"
umaps_after_integration$library_name + NoLegend()

print("UMAP colored by sex, age group, cell hashing status, sampling center and assay:")
## [1] "UMAP colored by sex, age group, cell hashing status, sampling center and assay:"
umaps_after_integration[2:length(umaps_before_integration)]
## $sex

## 
## $age_group

## 
## $is_hashed

## 
## $hospital

## 
## $assay

# Dataframe to generate supplementary figures
integrated_df <- data.frame(
  barcode = colnames(tonsil),
  assay = tonsil$assay,
  UMAP_1 = tonsil@reductions$umap@cell.embeddings[, "UMAP_1"],
  UMAP_2 = tonsil@reductions$umap@cell.embeddings[, "UMAP_2"],
  annotation_king = tonsil$cell_type
)
integrated_df$is_integrated <- "integrated"
supp_df <- bind_rows(list(unintegrated_df, integrated_df))

5 Save

# If it doesn't exist create temporal directory
dir.create(path_tmp_dir, showWarnings = FALSE) 


# Save integrated Seurat object
saveRDS(tonsil, path_to_save_obj)


# Save PCA matrices to compute the Local Inverse Simpson Index (LISI)
confounders_df <- tonsil@meta.data[, confounders]
saveRDS(confounders_df, path_to_save_confounders_df)
saveRDS(
  tonsil@reductions$pca@cell.embeddings[, 1:30],
  path_to_save_dimred_uncorrect
)
saveRDS(
  tonsil@reductions$harmony@cell.embeddings[, 1:30],
  path_to_save_dimred_correct
)


# Save dataframe to plot supplementary
write_delim(supp_df, path_to_save_supp_df, delim = ";", col_names = TRUE)

6 Session Information

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)
## 
## Matrix products: default
## BLAS:   /apps/R/3.6.0/lib64/R/lib/libRblas.so
## LAPACK: /home/devel/rmassoni/anaconda3/lib/libmkl_rt.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0        stringr_1.4.0        dplyr_1.0.4          purrr_0.3.4          readr_1.3.1          tidyr_1.1.0          tibble_3.0.1         ggplot2_3.3.0        tidyverse_1.3.0      harmony_1.0          Rcpp_1.0.6           SeuratWrappers_0.2.0 Seurat_3.2.0         BiocStyle_2.14.4    
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_1.4-1      deldir_0.1-25         ellipsis_0.3.1        ggridges_0.5.2        rprojroot_1.3-2       fs_1.4.1              rstudioapi_0.11       spatstat.data_1.4-3   farver_2.0.3          leiden_0.3.3          listenv_0.8.0         remotes_2.2.0         ggrepel_0.8.2         RSpectra_0.16-0       fansi_0.4.1           lubridate_1.7.8       xml2_1.3.2            codetools_0.2-16      splines_3.6.0         knitr_1.28            polyclip_1.10-0       jsonlite_1.7.2        broom_0.5.6           ica_1.0-2             cluster_2.1.0         dbplyr_1.4.4          png_0.1-7             uwot_0.1.8            shiny_1.4.0.2         sctransform_0.2.1     BiocManager_1.30.10   compiler_3.6.0        httr_1.4.2            backports_1.1.7       assertthat_0.2.1      Matrix_1.2-18         fastmap_1.0.1         lazyeval_0.2.2        cli_2.0.2             later_1.0.0           htmltools_0.5.1.1     tools_3.6.0           rsvd_1.0.3            igraph_1.2.5          gtable_0.3.0          glue_1.4.1            RANN_2.6.1            reshape2_1.4.4        rappdirs_0.3.1        spatstat_1.64-1       cellranger_1.1.0      vctrs_0.3.6           ape_5.3              
##  [55] nlme_3.1-148          lmtest_0.9-37         xfun_0.14             globals_0.12.5        rvest_0.3.5           mime_0.9              miniUI_0.1.1.1        lifecycle_0.2.0       irlba_2.3.3           goftest_1.2-2         future_1.17.0         MASS_7.3-51.6         zoo_1.8-8             scales_1.1.1          hms_0.5.3             promises_1.1.0        spatstat.utils_1.17-0 parallel_3.6.0        RColorBrewer_1.1-2    yaml_2.2.1            reticulate_1.16       pbapply_1.4-2         gridExtra_2.3         rpart_4.1-15          stringi_1.4.6         rlang_0.4.10          pkgconfig_2.0.3       evaluate_0.14         lattice_0.20-41       ROCR_1.0-11           tensor_1.5            labeling_0.3          patchwork_1.0.0       htmlwidgets_1.5.1     cowplot_1.0.0         tidyselect_1.1.0      here_0.1              RcppAnnoy_0.0.16      plyr_1.8.6            magrittr_1.5          bookdown_0.19         R6_2.4.1              generics_0.0.2        DBI_1.1.0             withr_2.4.1           pillar_1.4.4          haven_2.3.1           mgcv_1.8-31           fitdistrplus_1.1-1    survival_3.1-12       abind_1.4-5           future.apply_1.5.0    modelr_0.1.8          crayon_1.3.4         
## [109] KernSmooth_2.23-17    plotly_4.9.2.1        rmarkdown_2.2         readxl_1.3.1          grid_3.6.0            data.table_1.12.8     blob_1.2.1            reprex_0.3.0          digest_0.6.20         xtable_1.8-4          httpuv_1.5.3.1        munsell_0.5.0         viridisLite_0.3.0